# -*- coding:UTF-8 -*-
"""
@Project:   DataCrawler
@FileName:  beautifulsoup.py 
@CreateDate:2023/4/27 17:31  
@Author:    Jia  
@Desc:      Usage examples for BeautifulSoup
"""
from bs4 import BeautifulSoup
import re


# Parse the saved HTML page with the built-in "html.parser" backend
# ("lxml" is an alternative parser when it is installed).
# The file is opened in text mode, so the markup is already decoded as UTF-8;
# from_encoding is therefore not passed (BeautifulSoup ignores it, with a
# warning, when it receives already-decoded text).
# The context manager closes the file once the constructor has read it.
with open('./Data/res_text.html', encoding='utf-8') as html_text:
    soup = BeautifulSoup(html_text, 'html.parser')

# Print every tag named "title".
print(soup.find_all("title"))

# Print **every** tag whose "class" attribute contains "sister".
print(soup.find_all(attrs={"class": "sister"}))

# Print **every** tag whose "class" attribute contains "story", "title" or
# "sister" (a list of values matches any one of them).
print(soup.find_all(attrs={"class": ["story", "title", "sister"]}))

# Search text nodes instead of tags: an exact string, then any of several
# strings. The results are not stored or printed here.
# `string=` is the current name of the older `text=` keyword (bs4 >= 4.4.0).
soup.find_all(string="The Dormouse's story")
soup.find_all(string=['plants', 'algae'])

# A compiled regular expression can be used as the search value; find()
# returns the first text node the pattern matches, or None if there is none.
emailid_regexp = re.compile(r"\w+@\w+\.\w+")
first_email_id = soup.find(string=emailid_regexp)

